#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import json

# Location of the input corpus. A leading "~" is expanded to the current
# user's home directory. papre_sentents() reads this file line by line and
# parses each non-empty line with json.loads.
data_path = os.path.expanduser("~/data/corpus_news.json")

def split2sent(word_list):
    """Group a flat iterable of tokens into sentences.

    A sentence ends at (and includes) any of the terminator tokens
    '!', '！', '。', '?' or '？'. Tokens that follow the last terminator
    form a final, unterminated sentence.

    Args:
        word_list: Iterable of tokens, consumed in order.

    Returns:
        A list of sentences, each sentence being a list of tokens. An
        empty input yields an empty list.
    """
    # A tuple (not a set) keeps plain equality-based membership, so tokens
    # do not need to be hashable.
    terminators = ('!', "！", "。", "?", "？")

    sentences = []
    current = []
    for token in word_list:
        current.append(token)
        if token not in terminators:
            continue
        # Terminator reached: close the running sentence and start a new one.
        sentences.append(current)
        current = []

    # Keep any trailing tokens that were not closed by a terminator.
    if current:
        sentences.append(current)
    return sentences

# Output file that papre_sentents() writes the extracted sentences to.
# Mode 'w' means ./sents.txt is created or truncated as soon as this
# statement runs, i.e. whenever the module is executed or imported.
# NOTE(review): no explicit close() is visible in this section; presumably
# the handle is left to be closed at interpreter exit -- confirm.
outf = open("./sents.txt", 'w')

def papre_sentents():
    """Write the corpus at ``data_path`` to ``outf``, one sentence per line.

    Every non-empty line of the input file is parsed as a JSON object and
    its "word" entry is split into sentences with ``split2sent``. Each
    sentence is written as its tokens joined by single spaces, and one
    blank line is written after every document.

    Records that have no "word" entry (or whose value is null) are
    skipped, since there is nothing to split.

    Raises:
        OSError: If the input file cannot be opened.
        ValueError: If a non-empty line is not valid JSON.
    """
    # The context manager guarantees the input handle is released, even if
    # a malformed line makes json.loads raise part-way through the corpus.
    with open(data_path) as f:
        for line in f:
            line = line.strip("\n")
            if not line:
                continue
            words = json.loads(line).get("word")
            if words is None:
                # Missing/null token list: iterating it would raise TypeError.
                continue
            for sent in split2sent(words):
                outf.write(" ".join(sent) + "\n")
            # Blank line separates consecutive documents in the output.
            outf.write("\n")
    # outf is a module-level handle that stays open after this call; flush
    # so everything written here is on disk once the function returns.
    outf.flush()


# Run the sentence extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    papre_sentents()
